library(tidyverse)
library(tidycensus)
library(sf)
library(gt)
library(mapview)
# Make sure to read the GEOID as a char!
# Run `get_towns_in_region.R` to get this table.
# We need this for a lot of our work, so we'll load it up early!
towns_in_MAPC <- read_csv(
"data/processed/towns_in_MAPC_BRMPO.csv",
col_types = list(col_character()))
We downloaded data from the Fatality
Analysis Reporting System (FARS) FTP site for the years 2016 through
2020. We then filtered a specific table within those files
(accident.csv) for those crashes that occurred in
Massachusetts. We combined those files together for the entire state. We
verified checked cities that didn’t appear didn’t have fatal crashes
(Wenham, Hamilton, Norfolk, Rockport, and Essex), and modified several
town names to make them match other datasets. We start from that
simplified file. This operation was done in
build_FARS_dataset.rmd).
acc_ma_16_20 <- read_csv("./data/processed/fatal_crashes_ma_2016_2020.csv", show_col_types = FALSE) |>
select(STATE, COUNTY, COUNTYNAME, CITY, CITYNAME, FATALS, fars_year, city_join)
We can group the data together by municipality and sum the data to find the 5-year statewide fatality totals. We’ll then need to pare it back to the MPO region.
# Find the fatalities by muni.
acc_bymuni <- acc_ma_16_20 |>
group_by(STATE, COUNTYNAME, city_join) |>
summarize(total_fatals_16_20 = sum(FATALS))
# Attach whether the community is in MAPC/BRMPO.
acc_MAPC <- acc_bymuni |>
full_join(towns_in_MAPC |>
mutate(NAME_short = toupper(NAME_short)),
by = c("city_join" = "NAME_short")) |>
filter(!is.na(GEOID)) |>
mutate(total_fatals_16_20 = replace_na(total_fatals_16_20, 0))
# Summarize it for the whole region and calculate the rate.
acc_MAPC_summ <- acc_MAPC |>
group_by(RPA, MPO) |>
summarize(total_fatals_16_20 = sum(total_fatals_16_20)) |>
mutate(annual_fatal_rate = total_fatals_16_20 / 5)
# And report the results.
acc_MAPC_summ |>
ungroup() |>
janitor::adorn_totals() |>
gt() |>
tab_header(title = "Fatal Crashes in MAPC Communities",
subtitle = "as Reported in the 2016-2020 FARS") |>
fmt_number(columns = total_fatals_16_20, decimals = 0) |>
fmt_number(columns = annual_fatal_rate, decimals = 1) |>
cols_label(total_fatals_16_20 = "Total Fatalities (5 Years)") |>
cols_label(annual_fatal_rate = "Average Annual Fatalities") |>
tab_source_note(source_note = "Note: RPA = Regional Planning Agency. MPO = Metropolitan Planning Organization. MAPC = Metropolitan Area Planning Council. OCPC = Old Colony Planning Council. FARS = Fatality Analysis Reporting System") |>
tab_source_note(source_note = "Source: 2016-2020 FARS Accident Tables (FATALS)")
| Fatal Crashes in MAPC Communities | |||
|---|---|---|---|
| as Reported in the 2016-2020 FARS | |||
| RPA | MPO | Total Fatalities (5 Years) | Average Annual Fatalities |
| MAPC | Boston | 542 | 108.4 |
| MAPC/OCPC | Old Colony | 15 | 3.0 |
| Total | - | 557 | 111.4 |
| Note: RPA = Regional Planning Agency. MPO = Metropolitan Planning Organization. MAPC = Metropolitan Area Planning Council. OCPC = Old Colony Planning Council. FARS = Fatality Analysis Reporting System | |||
| Source: 2016-2020 FARS Accident Tables (FATALS) | |||
Information from MassDOT’s
Impact Portal suggest that 534 fatalities occurred in the MAPC
region:
This value is relatively close to the FARS-derived values.
We will use the census API to download population data from the 2020 US Census.
# vars <- tidycensus::load_variables(year = 2020, dataset = "pl")
# Download our towns from the census.
ma_towns <- get_decennial(
geography = "county subdivision",
variables = "P1_001N",
year = 2020,
state = "MA",
geometry = TRUE,
cb = FALSE
)
MAPC_towns <- ma_towns |> left_join(towns_in_MAPC, by = "GEOID") |>
filter(str_detect(RPA, "MAPC"))
Let’s make a quick map to check if that looks right.
MAPC_towns |> mapview(zcol = "RPA")
pop_MAPC_summ <- MAPC_towns |>
st_drop_geometry() |>
group_by(RPA, MPO) |>
summarize(total_pop = sum(value)) |>
ungroup()
pop_MAPC_summ |>
janitor::adorn_totals() |>
gt() |>
tab_header(title = "Population of MAPC and the Boston MPO",
subtitle = "Based on the 2020 U.S. Census") |>
fmt_number(columns = total_pop, decimals = 0) |>
cols_label(total_pop = "Total Population") |>
tab_source_note(source_note = "Note: RPA = Regional Planning Agency. MPO = Metropolitan Planning Organization. MAPC = Metropolitan Area Planning Council. OCPC = Old Colony Planning Council") |>
tab_source_note(source_note = "Source: 2020 U.S. Census: P.L. 94-171, Table: P1 (Total Population, P1_001N)")
| Population of MAPC and the Boston MPO | ||
|---|---|---|
| Based on the 2020 U.S. Census | ||
| RPA | MPO | Total Population |
| MAPC | Boston | 3,357,194 |
| MAPC/OCPC | Old Colony | 78,565 |
| Total | - | 3,435,759 |
| Note: RPA = Regional Planning Agency. MPO = Metropolitan Planning Organization. MAPC = Metropolitan Area Planning Council. OCPC = Old Colony Planning Council | ||
| Source: 2020 U.S. Census: P.L. 94-171, Table: P1 (Total Population, P1_001N) | ||
Now we can merge the fatalities and the population to estimate the fatality rate.
fatal_rate <- left_join(acc_MAPC_summ,
pop_MAPC_summ,
by = c("RPA", "MPO")) |>
janitor::adorn_totals() |>
mutate(
fatal_per_100k = annual_fatal_rate/total_pop * 100000)
fatal_rate |> gt() |>
tab_header(title = "Fatality Rate in MAPC Communities",
subtitle = "Based on the 2020 U.S. Census and 2016-2020 FARS data") |>
fmt_number(columns = total_pop, decimals = 0) |>
fmt_number(columns = total_fatals_16_20, decimals = 0) |>
fmt_number(columns = annual_fatal_rate, decimals = 1) |>
fmt_number(columns = fatal_per_100k, decimals = 2) |>
cols_label(total_fatals_16_20 = "Total Fatalities (5 Years)") |>
cols_label(annual_fatal_rate = "Average Annual Fatalities") |>
cols_label(total_pop = "Total Population") |>
cols_label(fatal_per_100k = "5-Year Average Fatality Rate (per 100,000 persons)") |>
tab_source_note(source_note = "Note: RPA = Regional Planning Agency. MPO = Metropolitan Planning Organization. MAPC = Metropolitan Area Planning Council. OCPC = Old Colony Planning Council. FARS = Fatality Analysis Reporting System") |>
tab_source_note(source_note = "Population source: 2020 U.S. Census: P.L. 94-171, Table: P1 (Total Population, P1_001N)") |>
tab_source_note(source_note = "Fatality source: 2016-2020 FARS Accident Tables (FATALS)")
| Fatality Rate in MAPC Communities | |||||
|---|---|---|---|---|---|
| Based on the 2020 U.S. Census and 2016-2020 FARS data | |||||
| RPA | MPO | Total Fatalities (5 Years) | Average Annual Fatalities | Total Population | 5-Year Average Fatality Rate (per 100,000 persons) |
| MAPC | Boston | 542 | 108.4 | 3,357,194 | 3.23 |
| MAPC/OCPC | Old Colony | 15 | 3.0 | 78,565 | 3.82 |
| Total | - | 557 | 111.4 | 3,435,759 | 3.24 |
| Note: RPA = Regional Planning Agency. MPO = Metropolitan Planning Organization. MAPC = Metropolitan Area Planning Council. OCPC = Old Colony Planning Council. FARS = Fatality Analysis Reporting System | |||||
| Population source: 2020 U.S. Census: P.L. 94-171, Table: P1 (Total Population, P1_001N) | |||||
| Fatality source: 2016-2020 FARS Accident Tables (FATALS) | |||||
We can go to the Areas of Persistent Poverty Project (APP) and Historically Disadvantaged Community (HDC) Status Tool referenced in the NOFO to obtain underserved census tracts.
The data needs to be cleaned before it’s joined to population data. We start by using the 2010 Census geography as that matches the format of the downloaded data. We’ll identify the places that are DACs using the 2010 geography, then we’ll handle bringing that to 2020 when we have it a little more manageable. Census tracts split and merge over time as the population changes. The Census Bureau likes to keep tracts around 4,000 people.
underserved <- read_csv("./data/base/RAISE_Persistent_Poverty.csv") |>
janitor::clean_names() |>
rename(state = a_state,
county = b_county,
tract_name = c_census_tract_name,
app_cnty = e_app_county_meets_definition,
app_tract = f_app_census_tract_meets_definition,
hdc_tract = g_hdc_census_tract_meets_definition) |>
mutate(app_cnty = recode(app_cnty , "No" = 0, "Yes" = 1, "Not Identified" = -1),
app_tract = recode(app_tract, "No" = 0, "Yes" = 1, "Not Identified" = -1),
hdc_tract = recode(hdc_tract, "No" = 0, "Yes" = 1, "Not Identified" = -1))
underserved_filter <- underserved |>
filter(state == "Massachusetts") |>
mutate(underserved = app_cnty == 1 | app_tract == 1 | hdc_tract == 1)
# The detailed census tract data comes with similar information attached.
# We can use that information to build the join. First we need to download the full data.
# It appears that the definition file used by USDOT for the DACs was based on the 2010 Census
# geography.
ma_tracts_10 <- get_decennial(
geography = "tract",
variables = "P001001",
year = 2010,
state = "MA",
geometry = TRUE,
cb = FALSE,
keep_geo_vars = TRUE)
# Just pulling this into a seperate variable so we don't have
# to go grab the data again if we need to fix things
ma_tracts_10_mod <- ma_tracts_10 |>
separate(NAME, into = c("tract_name", "county_name", "state_name"), sep = ",") |>
mutate(tract_name = trimws(tract_name),
county_name = trimws(county_name)) |>
left_join(underserved_filter,
by = c("tract_name" = "tract_name",
"county_name" = "county")
)
# Make the dataset a little smaller. This isn't strictly necessary.
ma_tracts_10_mod <- ma_tracts_10_mod |>
filter(county_name %in% c("Plymouth County", "Norfolk County", "Worcester County",
"Middlesex County", "Essex County", "Suffolk County")) |>
filter(underserved == TRUE)
We clip the dataset to contain just those that are within the
region’s town boundaries. I was having trouble getting the intersect
feature to work as is–I suspect it has something to do with the borders
of tracts being counted as an intersection. st_covered_by
seems to work though. Without buffering it by 10m, this missed a couple
of tracts. This method needs some work to make it more stable.
underserved_inMAPC <- st_join(ma_tracts_10_mod,
MAPC_towns |> group_by(RPA, MPO) |> summarize() |> st_buffer(10),
join = st_covered_by) |>
filter(str_detect(RPA, "MAPC")) |>
st_transform(st_crs(ma_tracts_10_mod))
underserved_inMAPC |>
select(GEOID10, app_cnty, app_tract, hdc_tract,RPA, MPO) |>
st_transform(4326) |>
st_write("./data/processed/underserved_inMAPC_2010CensusTracts.shp",
delete_dsn = TRUE)
## Deleting source `./data/processed/underserved_inMAPC_2010CensusTracts.shp' failed
## Writing layer `underserved_inMAPC_2010CensusTracts' to data source
## `./data/processed/underserved_inMAPC_2010CensusTracts.shp' using driver `ESRI Shapefile'
## Writing 121 features with 6 fields and geometry type Multi Polygon.
underserved_inMAPC |>
select(GEOID10, app_cnty, app_tract, hdc_tract,RPA, MPO) |>
st_transform(4326) |>
st_write("./data/processed/underserved_inMAPC_2010CensusTracts.geojson",
delete_dsn = TRUE)
## Deleting source `./data/processed/underserved_inMAPC_2010CensusTracts.geojson' failed
## Writing layer `underserved_inMAPC_2010CensusTracts' to data source
## `./data/processed/underserved_inMAPC_2010CensusTracts.geojson' using driver `GeoJSON'
## Writing 121 features with 6 fields and geometry type Multi Polygon.
mapview(ma_tracts_10_mod, col.regions = "black") +
mapview(underserved_inMAPC, col.regions = "red") +
mapview(MAPC_towns, zcol = "MPO")
underserved_inMAPC |>
st_drop_geometry() |>
group_by(RPA, MPO) |>
summarize(DAC_total = sum(value)) |>
janitor::adorn_totals() |>
gt() |>
fmt_number(columns = DAC_total, decimals = 0) |>
cols_label(DAC_total = "DAC Population") |>
tab_header(title = "Population Living in Underserved Census Tracts in Massachusetts (2010 Census)",
subtitle = "Based on the USDOT's definition of disadvantaged communities ('DACs') from the Justice40 Initiative") |>
tab_source_note(source_note = "Note: RPA = Regional Planning Agency. MPO = Metropolitan Planning Organization. MAPC = Metropolitan Area Planning Council. OCPC = Old Colony Planning Council. DAC = Disadvantaged Community") |>
tab_source_note(source_note = "Classification source: Areas of Persistent Poverty Project (APP) and Historically Disadvantaged Community (HDC) Status Tool: https://datahub.transportation.gov/stories/s/tsyd-k6ij") |>
tab_source_note(source_note = "Population source: 2010 US Census: SF1, P001001 (Total Population)")
| Population Living in Underserved Census Tracts in Massachusetts (2010 Census) | ||
|---|---|---|
| Based on the USDOT's definition of disadvantaged communities ('DACs') from the Justice40 Initiative | ||
| RPA | MPO | DAC Population |
| MAPC | Boston | 490,109 |
| MAPC/OCPC | Old Colony | 1,753 |
| Total | - | 491,862 |
| Note: RPA = Regional Planning Agency. MPO = Metropolitan Planning Organization. MAPC = Metropolitan Area Planning Council. OCPC = Old Colony Planning Council. DAC = Disadvantaged Community | ||
| Classification source: Areas of Persistent Poverty Project (APP) and Historically Disadvantaged Community (HDC) Status Tool: https://datahub.transportation.gov/stories/s/tsyd-k6ij | ||
| Population source: 2010 US Census: SF1, P001001 (Total Population) | ||
That’s useful to know, but it’s still the wrong year. We need to go figure out how to deal with the changes between 2010 and 2020.
ma_tracts_20 <- get_decennial(
geography = "tract",
variables = "P1_001N",
year = 2020,
state = "MA",
geometry = TRUE,
cb = FALSE,
keep_geo_vars = TRUE)
mapview(ma_tracts_20, col.regions = "grey50") + mapview(underserved_inMAPC, col.regions = "red")
Hmm. So our options are to do some sort of allocation based on area
or perhaps population. tidycensus::interpolate_pw() might
have what we need. Or we can basically say, we’ll take the tracts that
essentially match up with what they were trying to do with the initial
dataset. We can say any tract that’s MOSTLY covered by one of
the tracts will be included. Then we can take a peek and see if it looks
good. There aren’t many non-1:1 relations, so this really has a very
limited effect on the results.
# We'll start by selecting tracts that touch our tracts. It'll make things quicker.
ma_tracts_20_nearby <- st_intersection(ma_tracts_20, underserved_inMAPC) |>
st_set_geometry(NULL) |>
select(GEOID) |>
distinct() |>
pull()
# No idea why tidyverse isnt working here. But we want the area of each tract.
# We'll use this to figure out which 2020 tracts to pull in.
ma_tracts_20_near <- ma_tracts_20 |>
filter(GEOID %in% ma_tracts_20_nearby)
# Find the area of the full 2020 Census tracts. This will be our total area.
ma_tracts_20_near$area_tract <- st_area(ma_tracts_20_near)
mapview(ma_tracts_20_near) + mapview(underserved_inMAPC, col.regions = "red")
# Intersect the underserved areas from 2010. We're going to use this intersection
# to figure out the proportion of each new tract that's close to the underserved areas.
ma_tracts_20_filter <- st_intersection(ma_tracts_20_near,
underserved_inMAPC |> select(GEOID10 = GEOID)) |>
st_make_valid() # This seems important with some odd geometry that results from that.
ma_tracts_20_filter$area_int <- st_area(ma_tracts_20_filter)
# Let's select a cutoff and see how we do. We call it filter2 because it's a pain
# if we messed up and have to go do that expensive intersection a second time.
# I'd rather use a little extra memory than wait around.
ma_tracts_20_filter2 <- ma_tracts_20_filter |>
group_by(GEOID) |>
mutate(pct_area = units::drop_units(area_int / area_tract)) |>
ungroup() |>
st_make_valid() |>
filter(pct_area > 0.2) |>
select(GEOID) |>
st_drop_geometry() |>
distinct() |> # No need for duplicates here!
pull()
Ok! Now all that’s left to do is see how that affects our 2020 Census selection. The left maps show the 2020 tracts and the right map show how that compares to the 2010 geography.
m1 <- mapview(ma_tracts_20, col.regions = "grey80", lwd = 1.5) +
mapview(ma_tracts_20 |> filter(GEOID %in% ma_tracts_20_filter2)) +
mapview(underserved_inMAPC, col.regions = "red")
m2 <- mapview(ma_tracts_10, col.regions = "grey80", lwd = 1.5) +
mapview(ma_tracts_20 |> filter(GEOID %in% ma_tracts_20_filter2)) +
mapview(underserved_inMAPC, col.regions = "red")
leafsync::sync(m1,m2)